OBJECTIVE: The purpose of this model is to analyse the pattern and spread of COVID-19 from January 2020 onwards. A variety of packages were used for this exercise.
#install.packages("kableExtra")
suppressMessages(library(magrittr)) # pipe operations
suppressMessages(library(lubridate)) # date operations
suppressMessages(library(tidyverse)) # ggplot2, tidyr, dplyr...
suppressMessages(library(gridExtra)) # multiple grid-based plots on a page
suppressMessages(library(ggforce)) # accelerating ggplot2
suppressMessages(library(kableExtra)) # complex tables
suppressMessages(library(leaflet)) #for map
suppressMessages(library(plotly)) #plotly
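Data Ingestion: Reading data from the CSSEGISandData/COVID-19 GitHub repository, which is updated every day. It contains data for the whole world. Note that the URLs below point at one specific commit, so the data read here is a fixed snapshot rather than the latest update.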
## The three global time-series tables (confirmed, deaths, recovered) live in
## the same directory of the same pinned commit, so build the common URL
## prefix once and append the file name for each table.
csse_base_url <- paste0(
  "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/",
  "cd5d1b9eacb96a2b598cb0e6fb2a3145978df4d0/",
  "csse_covid_19_data/csse_covid_19_time_series/"
)
confirmed <- read.csv(
  paste0(csse_base_url, "time_series_covid19_confirmed_global.csv")
)
death <- read.csv(
  paste0(csse_base_url, "time_series_covid19_deaths_global.csv")
)
recovered <- read.csv(
  paste0(csse_base_url, "time_series_covid19_recovered_global.csv")
)
Verifying the data and converting it into the desired format.
## Preview the first ten rows and first ten columns of the confirmed table
confirmed[seq_len(10), seq_len(10)]
## Province.State Country.Region Lat Long X1.22.20
## 1 Afghanistan 33.0000 65.0000 0
## 2 Albania 41.1533 20.1683 0
## 3 Algeria 28.0339 1.6596 0
## 4 Andorra 42.5063 1.5218 0
## 5 Angola -11.2027 17.8739 0
## 6 Antigua and Barbuda 17.0608 -61.7964 0
## 7 Argentina -38.4161 -63.6167 0
## 8 Armenia 40.0691 45.0382 0
## 9 Australian Capital Territory Australia -35.4735 149.0124 0
## 10 New South Wales Australia -33.8688 151.2093 0
## X1.23.20 X1.24.20 X1.25.20 X1.26.20 X1.27.20
## 1 0 0 0 0 0
## 2 0 0 0 0 0
## 3 0 0 0 0 0
## 4 0 0 0 0 0
## 5 0 0 0 0 0
## 6 0 0 0 0 0
## 7 0 0 0 0 0
## 8 0 0 0 0 0
## 9 0 0 0 0 0
## 10 0 0 0 3 4
## Recover the reporting dates from the column names. The first four columns
## are Province.State, Country.Region, Lat and Long; every later column is
## named after its date in month.day.year form with a leading "X"
## (e.g. "X1.22.20").
col <- ncol(confirmed)
## Strip only the leading "X" instead of taking a fixed-width substring:
## a fixed width of seven characters cuts the year short for names with a
## two-digit month and day (e.g. "X10.22.20" would become "10.22.2").
dates <- names(confirmed)[5:col] %>%
  sub("^X", "", .) %>%
  mdy()
range(dates)
## [1] "2020-01-22" "2020-03-25"
## [1] "2020-01-22" "2020-03-22"
## First and last reporting dates, plus display versions formatted as
## day, abbreviated month name and four-digit year
min_date <- min(dates)
max_date <- max(dates)
min_date_formt <- format(min_date, "%d %b %Y")
max_date_formt <- format(max_date, "%d %b %Y")
Data cleaning, manipulation and visualisation were performed. On the map, the circles with the largest radius mark the few countries that have the highest number of people affected by COVID-19.
#' Reshape a wide time-series table into long per-country daily counts.
#'
#' @param data A data frame with columns `Province.State`, `Country.Region`,
#'   `Lat` and `Long`, plus one count column per day whose name is the date in
#'   month.day.year form with a leading "X" (e.g. `X1.22.20`).
#' @return A plain data.frame with columns `country`, `date` (a Date) and
#'   `count`, holding one row per country and date. Counts of all rows that
#'   share a country are summed, ignoring missing values.
cleanData <- function(data) {
  data %>%
    ## drop the columns that are not needed for per-country totals
    select(-c(Province.State, Lat, Long)) %>%
    rename(country = Country.Region) %>%
    ## convert from wide to long format: one row per country row and day
    pivot_longer(-country, names_to = "date", values_to = "count") %>%
    ## Convert the column name to a Date. Only the leading "X" is removed;
    ## a fixed-width substr(2, 8) would truncate names with a two-digit month
    ## and day (e.g. "X10.22.20" -> "10.22.2") and mis-parse those dates.
    mutate(date = mdy(sub("^X", "", date))) %>%
    ## aggregate by country
    group_by(country, date) %>%
    summarise(count = sum(count, na.rm = TRUE)) %>%
    ungroup() %>%
    as.data.frame()
}
## Long per-country tables, one per metric, with the generic count column
## renamed after the metric it holds
data_confirmed <- rename(cleanData(confirmed), confirmed = count)
data_deaths <- rename(cleanData(death), deaths = count)
data_recovered <- rename(cleanData(recovered), recovered = count)
## Combine the three tables pairwise; merge() joins on the columns the
## tables have in common (country and date)
data <- Reduce(merge, list(data_confirmed, data_deaths, data_recovered))
## Distinct country/region names, leaving out the 'Cruise Ship' entry
countries <- setdiff(data[["country"]], 'Cruise Ship')
## Daily series for India, shown as three bar charts (confirmed, deaths,
## recovered) arranged in three rows of a single plotly figure
Ind <- filter(data, country == 'India')
p <- ggplot(Ind, aes(date, confirmed)) +
  geom_bar(stat = "identity", fill = "#56B4E9")
#ggplotly(p)
p1 <- ggplot(Ind, aes(date, deaths)) +
  geom_bar(stat = "identity", fill = "#CC0000")
#ggplotly(p1)
p2 <- ggplot(Ind, aes(date, recovered)) +
  geom_bar(stat = "identity", fill = "#00FF00")
#ggplotly(p2)
subplot(p, p1, p2, nrows = 3, margin = 0.1, titleX = TRUE)
Visualizing the data in the form of a map
## Daily totals over all countries, labelled as the pseudo-country 'World'
data_world <- data %>%
  group_by(date) %>%
  summarise(
    country = 'World',
    confirmed = sum(confirmed),
    deaths = sum(deaths),
    recovered = sum(recovered)
  )
## Append the world rows to the per-country table
data <- rbind(data, data_world)
## Cases still open: confirmed minus those who died or recovered
data <- mutate(data, current_confirmed = confirmed - deaths - recovered)
## Take the last column of the wide table as the latest confirmed count for
## each location, and keep only the columns needed for the map
x <- confirmed
x$confirmed <- x[, ncol(x)]
x %<>%
  select(c(Country.Region, Province.State, Lat, Long, confirmed)) %>%
  ## popup text: "<country> - <province>: <count>"
  mutate(txt = paste0(Country.Region, ' - ', Province.State, ': ', confirmed))
m <- leaflet(width = 1200, height = 800) %>% addTiles()
## Circle markers (radius in pixels) grow with log2 of the confirmed count.
## log2(0) is -Inf, which is not a usable radius, so the radius is clamped at
## 0: locations with no confirmed cases get a zero-size marker, while every
## location with at least one case keeps its original radius (>= 2).
m %<>% addCircleMarkers(
  x$Long, x$Lat,
  radius = pmax(0, 2 + log2(x$confirmed)),
  stroke = FALSE,
  color = 'red',
  fillOpacity = 0.3,
  popup = x$txt
)
## show the world map
m